Plan: - Predict released movie revenue - Predictors: - adult - popularity - budget - release date -> season - runtime - genres - casts (and their popularity?) - non-casts (and their popularity?)

!install.packages(c("httr", "dplyr", "jsonlite", "progress", "dplyr", "tidyr", "ggplot2", "knitr", "ggcorrplot", "stringr", "xgboost", "lubridate", "randomForest", "forcats", "plotly"))
library(httr)
library(dplyr)
library(jsonlite)
library(progress)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(ggcorrplot)
library(stringr)
library(xgboost)
library(lubridate)
library(randomForest)
library(forcats)
library(xgboost)
library(plotly)
set.seed(42)
API_KEY <- "e6d34aa453d1a71af2b6e77848923e57"
BASE_URL <- "https://api.themoviedb.org/3/"
DISCOVER_MOVIE_PATH <- "discover/movie"
MOVIE_PATH <- "movie/"

MOVIES_COUNT <- 1000
MOVIES_PER_PAGE <- 20

Fetching Data

response <- GET(
  url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
  query = list(
    sort_by = "revenue.desc",
    release_date.lte = "2025-04-25",
    api_key = API_KEY
  )
)

movie_total_results <- content(response)$total_results
movie_total_pages <- content(response)$total_pages

cat("There are", movie_total_results, "movies in total provided by the API.\n")
## There are 900253 movies in total provided by the API.
cat("These movies are paginated into", movie_total_pages, "pages", "with", MOVIES_PER_PAGE, "movies per page.\n")
## These movies are paginated into 45013 pages with 20 movies per page.
# response <- GET(
#   url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
#   query = list(
#     sort_by = "revenue.desc",
#     release_date.lte = "2025-04-25",
#     api_key = API_KEY
#   )
# )
# 
# movie_total_results <- content(response)$total_results
# movie_total_pages <- content(response)$total_pages
# 
# cat("There are", movie_total_results, "movies in total provided by the API.\n")
# cat("These movies are paginated into", movie_total_pages, "pages", "with", MOVIES_PER_PAGE, "movies per page.\n")

Fetch movie ids and titles

movies <- data.frame()
n_pages <- ceiling(MOVIES_COUNT / MOVIES_PER_PAGE)
pb <- txtProgressBar(min = 0, max = n_pages, style = 3)
##   |                                                                              |                                                                      |   0%
for (page in 1:n_pages) {
  response <- GET(
    url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
    query = list(
      sort_by = "revenue.desc",
      page = page,
      api_key = API_KEY
    )
  )
  
  # Check if API request was successful
  if (status_code(response) != 200) {
    print(paste("Skipping page", page, "- API error:", status_code(response)))
    next
  }
  
  data <- fromJSON(content(response, as = "text"))$results
  page_movies <- data.frame(
    movie_id = data$id,
    title = data$title
  )
  movies <- rbind(movies, page_movies)
  setTxtProgressBar(pb, page)
}
##   |                                                                              |=                                                                     |   2%  |                                                                              |===                                                                   |   4%  |                                                                              |====                                                                  |   6%  |                                                                              |======                                                                |   8%  |                                                                              |=======                                                               |  10%  |                                                                              |========                                                              |  12%  |                                                                              |==========                                                            |  14%  |                                                                              |===========                                                           |  16%  |                                                                              |=============                                                         |  18%  |                                                                              |==============                                                        |  20%  |                                                                              |===============                                                       |  22%  |                                                                              |=================                                                     |  24%  |                                                                              |==================                                                    |  26%  |                                                                              |====================                                                  |  28%  |                                                                              |=====================                                                 |  30%  |                                                                              |======================                                                |  32%  |                                                                              |========================                                              |  34%  |                                                                              |=========================                                             |  36%  |                                                                              |===========================                                           |  38%  |                                                                              |============================                                          |  40%  |                                                                              |=============================                                         |  42%  |                                                                              |===============================                                       |  44%  |                                                                              |================================                                      |  46%  |                                                                              |==================================                                    |  48%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================                                  |  52%  |                                                                              |======================================                                |  54%  |                                                                              |=======================================                               |  56%  |                                                                              |=========================================                             |  58%  |                                                                              |==========================================                            |  60%  |                                                                              |===========================================                           |  62%  |                                                                              |=============================================                         |  64%  |                                                                              |==============================================                        |  66%  |                                                                              |================================================                      |  68%  |                                                                              |=================================================                     |  70%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  74%  |                                                                              |=====================================================                 |  76%  |                                                                              |=======================================================               |  78%  |                                                                              |========================================================              |  80%  |                                                                              |=========================================================             |  82%  |                                                                              |===========================================================           |  84%  |                                                                              |============================================================          |  86%  |                                                                              |==============================================================        |  88%  |                                                                              |===============================================================       |  90%  |                                                                              |================================================================      |  92%  |                                                                              |==================================================================    |  94%  |                                                                              |===================================================================   |  96%  |                                                                              |===================================================================== |  98%  |                                                                              |======================================================================| 100%
close(pb)
cat("Successfully fetch all movie ids and titles!")
## Successfully fetch all movie ids and titles!

Fetch movie details (revenue, adult, popularity, budget, release date, runtime, genres)

pb <- txtProgressBar(min = 0, max = nrow(movies), style = 3)
##   |                                                                              |                                                                      |   0%
all_genres <- c()

for (i in 1:nrow(movies)) {
  movie_id <- movies$movie_id[i]
  
  response <- GET(
    url = paste0(BASE_URL, MOVIE_PATH, movie_id),
    query = list(
      api_key = API_KEY
    )
  )
  
  # Check if API request was successful
  if (status_code(response) != 200) {
    print(paste("Skipping movie", movie_id, "- API error:", status_code(response)))
    next
  }
  
  data <- fromJSON(content(response, as = "text"))

  movies$revenue[i] <- data$revenue
  movies$adult[i] <- as.logical(data$adult)
  movies$popularity[i] <- data$popularity
  movies$budget[i] <- data$budget
  movies$release_date[i] <- data$release_date
  movies$runtime[i] <- data$runtime

  genres <- data$genres
  movies$genres[i] <- paste(genres$name, collapse = ",")
  for (genre in genres$name) {
    all_genres <- unique(c(all_genres, genre))
  }
  setTxtProgressBar(pb, i)
}
##   |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |==                                                                    |   4%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=======                                                               |  11%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |=========                                                             |  14%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |================                                                      |  24%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |=====================                                                 |  31%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |=======================                                               |  34%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                                                                              |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |==============================                                        |  44%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                                       |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |===================================                                   |  51%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                              |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |=====================================                                 |  54%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  56%  |                                                                              |========================================                              |  57%  |                                                                              |========================================                              |  58%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |==========================================                            |  61%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  66%  |                                                                              |===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |===================================================                   |  74%  |                                                                              |====================================================                  |  74%  |                                                                              |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |=========================================================             |  81%  |                                                                              |=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |==========================================================            |  84%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |=================================================================     |  94%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================|  99%  |                                                                              |======================================================================| 100%
close(pb)
cat("Successfully fetch movie details!")
## Successfully fetch movie details!
all_genres
##  [1] "Action"          "Adventure"       "Fantasy"         "Science Fiction"
##  [5] "Drama"           "Romance"         "Animation"       "Comedy"         
##  [9] "Family"          "Thriller"        "Crime"           "History"        
## [13] "Music"           "War"             "Mystery"         "Horror"         
## [17] "Western"         "Documentary"

Fetch movie credits (casts and non-casts)

cat(paste0(BASE_URL, MOVIE_PATH, 1231, "credits"))
## https://api.themoviedb.org/3/movie/1231credits
pb <- txtProgressBar(min = 0, max = nrow(movies), style = 3)
##   |                                                                              |                                                                      |   0%
movies$cast <- NA       # Stores top 5 actors
movies$non_cast <- NA   # Stores top 5 crew members (excluding actors)

for (i in 1:nrow(movies)) {
  movie_id <- movies$movie_id[i]
  
  response <- GET(
    url = paste0(BASE_URL, MOVIE_PATH, movie_id, "/credits"),
    query = list(
      api_key = API_KEY 
    )
  )
  
  # Check if API request was successful
  if (status_code(response) != 200) {
    print(paste("Skipping movie", movie_id, "- API error:", status_code(response)))
    next
  }

  credits <- fromJSON(content(response, as = "text"))

  # Extract CAST (Actors)
  if (!is.null(credits$cast) && length(credits$cast) > 0) {
    # Filter only actors (those with a "character" field)
    cast_members <- credits$cast[!is.null(credits$cast$character), ]
    
    top_cast <- head(cast_members, 5)
    
    # Format: "Actor1 (popularity), Actor2 (popularity)"
    movies$cast[i] <- paste0(top_cast$name, "(", round(top_cast$popularity, 1), ")", collapse = ", ")
    movies$actor_avg_pop[i] <- mean(top_cast$popularity)
  } else {
    movies$cast[i] <- NA  # Handle missing actor
    movies$actor_avg_pop[i] <- NA
  }
  
  # Extract NON-CAST (Directors, Producers, etc.)
  if (!is.null(credits$crew) && length(credits$crew) > 0) {
    # Exclude actors (crew members without "character" role)
    non_cast_members <- credits$crew[is.null(credits$crew$character), ]
    
    top_non_cast <- head(non_cast_members, 5)
    
    # Format: "Crew1 (popularity), Crew2 (popularity)"
    movies$non_cast[i] <- paste0(top_non_cast$name, "(", round(top_non_cast$popularity, 1), ")", collapse = ", ")
    movies$crew_avg_pop[i] <- mean(top_non_cast$popularity)
  } else {
    movies$non_cast[i] <- NA  # Handle missing crew
    movies$crew_avg_pop[i] <- NA
  }

  setTxtProgressBar(pb, i)
}
##   |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |==                                                                    |   4%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=======                                                               |  11%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |=========                                                             |  14%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |================                                                      |  24%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |=====================                                                 |  31%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |=======================                                               |  34%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                                                                              |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |==============================                                        |  44%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                                       |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |===================================                                   |  51%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                              |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |=====================================                                 |  54%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  56%  |                                                                              |========================================                              |  57%  |                                                                              |========================================                              |  58%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |==========================================                            |  61%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  66%  |                                                                              |===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |===================================================                   |  74%  |                                                                              |====================================================                  |  74%  |                                                                              |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |=========================================================             |  81%  |                                                                              |=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |==========================================================            |  84%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |=================================================================     |  94%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================|  99%  |                                                                              |======================================================================| 100%
close(pb)

Change column names

colnames(movies) <- c(
  "movie_id",      # Unique movie ID
  "title",         # Movie title
  "revenue",       # Box office revenue
  "adult",         # Whether the movie is rated adult (TRUE/FALSE)
  "popularity",    # Popularity score from TMDB
  "budget",        # Production budget
  "release_date",  # Release date
  "runtime",       # Duration in minutes
  "genres",        # Movie genres
  "actors",        # Top 5 actors
  "crews",         # Top 5 crew members
  "actor_avg_pop", # Average popularity of the top 5 actors
  "crew_avg_pop"   # Average popularity of the top 5 crew members
)
colnames(movies)
##  [1] "movie_id"      "title"         "revenue"       "adult"        
##  [5] "popularity"    "budget"        "release_date"  "runtime"      
##  [9] "genres"        "actors"        "crews"         "actor_avg_pop"
## [13] "crew_avg_pop"

Cleaning the data

Store a copy of the original data

movies_original <- movies

Convert release_date to date

movies$release_date <- as.Date(movies$release_date)

Check for missing values

colSums(is.na(movies))
##      movie_id         title       revenue         adult    popularity 
##             0             0             0             0             0 
##        budget  release_date       runtime        genres        actors 
##             0             6             0             0             3 
##         crews actor_avg_pop  crew_avg_pop 
##             2             3             2
cat("Summary for movie_id:\n")
## Summary for movie_id:
summary(movies$movie_id)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      11    4560   49033  201106  330111 1462748
cat("Number of empty title: ", sum(movies$title == ""), "\n")
## Number of empty title:  0
cat("Summary for revenue:\n")
## Summary for revenue:
summary(movies$revenue)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.915e+08 2.448e+08 3.364e+08 4.421e+08 5.087e+08 2.924e+09
cat("Unique values of adult:\n")
## Unique values of adult:
unique(movies$adult)
## [1] FALSE
cat("Summary for popularity:\n")
## Summary for popularity:
summary(movies$popularity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   7.047  11.082  15.143  16.706 485.233
cat("Summary for budget:\n")
## Summary for budget:
summary(movies$budget)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0  40000000  84000000  96251576 145000000 460000000
cat("Summary for release_date:\n")
## Summary for release_date:
summary(movies$release_date)
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "1939-12-15" "2002-12-18" "2011-09-03" "2009-05-06" "2017-07-03" "2025-03-31" 
##         NA's 
##          "6"
cat("Summary for runtime:\n")
## Summary for runtime:
summary(movies$runtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     102     116     118     131     233
cat("Unique values of genres:\n")
## Unique values of genres:
unique(unique(unlist(strsplit(movies$genres, ","))))
##  [1] "Action"          "Adventure"       "Fantasy"         "Science Fiction"
##  [5] "Drama"           "Romance"         "Animation"       "Comedy"         
##  [9] "Family"          "Thriller"        "Crime"           "History"        
## [13] "Music"           "War"             "Mystery"         "Horror"         
## [17] "Western"         "Documentary"

We observe that all values are normal, except that adult is always false, some movies have 0 budget, some movies have 0 runtime, some movies don’t have any actor or crew, and there are no release date for 4 movies.

Adult is always false

Since adult is always false, we’re not going to include it into our analysis since it doesn’t provide any insight.

movies <- select(movies, -adult)

0 budget movies

zero_budget <- movies$budget == 0
movies[zero_budget,]
##     movie_id
## 56   1452397
## 163  1066298
## 257  1462748
## 286  1299537
## 294  1228891
## 302   801803
## 313   612845
## 323  1357305
## 338   715904
## 386   923192
## 395   334298
## 415   538331
## 449   497984
## 455   372058
## 498  1147596
## 502    15947
## 507   479306
## 530   916224
## 695   783675
## 714  1089654
## 746   605375
## 765   635389
## 778  1022145
## 802   170882
## 805   362682
## 820   575813
## 825   457251
## 835  1078012
## 863   716815
## 869   170657
## 899   803266
## 905   508747
## 928   513692
## 952   168626
## 957  1462164
## 960   240417
## 979   810693
## 995  1300945
##                                                                        title
## 56                                                          No Need For Help
## 163                                                           Full River Red
## 257                                                               DUDURITMUS
## 286                                                                Successor
## 294                                                                Pegasus 2
## 302                                                                 Moon Man
## 313                                                    My People, My Country
## 323                                                 Detective Chinatown 1900
## 338 Metallica: WorldWired Tour - Live in Manchester, England - June 18, 2019
## 386                                                         Too Cool to Kill
## 395                                                             Monster Hunt
## 415                                                    Hello Mr. Billionaire
## 449                                                           Monster Hunt 2
## 455                                                               Your Name.
## 498                                                               Article 20
## 502                                                     The Three Caballeros
## 507                                                            Never Say Die
## 530                                                                   Suzume
## 695                                                      The First Slam Dunk
## 714                                                                 Chang'an
## 746                                                              The Bravest
## 765                                                               Jiang Ziya
## 778                                                              Home Coming
## 802                                                           The Black Hand
## 805                                                        Goodbye Mr. Loser
## 820                                                              Better Days
## 825                                                                    Youth
## 835                                              Boonie Bears: Guardian Code
## 863                                                      A Little Red Flower
## 869                               Journey to the West: Conquering the Demons
## 899                                                                Nice View
## 905                                                              Us and Them
## 928                                                              Raging Fire
## 952                                                                     DeAD
## 957                                                  Jigsaw Prank Gone Wrong
## 960                                                         Seeds of Destiny
## 979                                                         Jujutsu Kaisen 0
## 995                                                   A Place Called Silence
##        revenue popularity budget release_date runtime
## 56  1000000000     0.0214      0         <NA>      10
## 163  673556758     0.9452      0   2023-01-22     157
## 257  500000000     0.0000      0         <NA>      24
## 286  473859800     1.6694      0   2024-07-13     133
## 294  468924000     1.2722      0   2024-02-10     121
## 302  460300583     1.4097      0   2022-07-29     122
## 313  450064993     2.0312      0   2019-09-30     154
## 323  441460000     4.8650      0   2025-01-29     135
## 338  426900000     0.9987      0   2020-06-08     150
## 386  393014816     0.4171      0   2022-02-01     109
## 395  387053506     2.4567      0   2015-07-16     118
## 415  376000000     1.3348      0   2018-07-27     118
## 449  361682618     2.6082      0   2018-02-16     110
## 455  358000000    21.6188      0   2016-08-26     106
## 498  337554287     2.0562      0   2024-02-10     141
## 502  336000000     2.1768      0   1944-12-21      71
## 507  334530869     0.4672      0   2017-09-30     100
## 530  323638107    16.3569      0   2022-11-11     121
## 695  258000000     8.4948      0   2022-12-03     125
## 714  252520750     1.8860      0   2023-07-02     168
## 746  245179562     2.2259      0   2019-08-29     118
## 765  240656068     2.5244      0   2020-10-01     105
## 778  236085820     1.1072      0   2022-09-30     137
## 802  230000000     0.7611      0   1973-03-16      90
## 805  228549747     1.3488      0   2015-09-30     104
## 820  225879065     5.1351      0   2019-10-25     135
## 825  224160000     1.5422      0   2017-12-15     136
## 835  221982308     3.4369      0   2023-01-22      94
## 863  216000000     0.9708      0   2020-12-31     128
## 869  215637183     2.9811      0   2013-02-07     110
## 899  211018982     1.6484      0   2022-02-01     106
## 905  209221331     4.0993      0   2018-04-28     120
## 928  205000000     3.3429      0   2021-07-28     125
## 952  201103001     0.3323      0   2013-02-10     104
## 957  200000000     0.0000      0   2024-12-30       5
## 960  200000000     0.5544      0   1946-01-01      20
## 979  195870885    19.7894      0   2021-12-24     105
## 995  192856689     2.6183      0   2024-07-03     119
##                                                genres
## 56                                                   
## 163                   Mystery,Comedy,Thriller,History
## 257                                                  
## 286                                            Comedy
## 294                            Drama,Comedy,Adventure
## 302                      Science Fiction,Comedy,Drama
## 313                                     Drama,History
## 323                             Comedy,Mystery,Action
## 338                                             Music
## 386                                     Action,Comedy
## 395                          Comedy,Fantasy,Adventure
## 415                                            Comedy
## 449                          Adventure,Comedy,Fantasy
## 455                           Animation,Romance,Drama
## 498                                      Drama,Comedy
## 502                            Animation,Family,Music
## 507                                    Comedy,Fantasy
## 530                 Animation,Drama,Adventure,Fantasy
## 695                            Animation,Comedy,Drama
## 714                                 Animation,History
## 746                                      Drama,Action
## 765                Animation,Action,Fantasy,Adventure
## 778                                         Drama,War
## 802                                             Crime
## 805                                    Romance,Comedy
## 820                               Drama,Crime,Romance
## 825                                     Drama,History
## 835 Animation,Comedy,Science Fiction,Family,Adventure
## 863                                      Drama,Family
## 869                   Action,Fantasy,Adventure,Comedy
## 899                               Drama,Family,Comedy
## 905                                     Drama,Romance
## 928           Action,Crime,Thriller,Adventure,Mystery
## 952                             Comedy,Drama,Thriller
## 957                                            Comedy
## 960                                   Documentary,War
## 979                          Animation,Action,Fantasy
## 995                            Crime,Mystery,Thriller
##                                                                                                          actors
## 56                                                                                                     Sam N(0)
## 163                            Teng Shen(1), Jackson Yee(2.3), Zhang Yi(0.7), Lei Jiayin(0.7), Yue Yunpeng(0.4)
## 257                                                                                                        <NA>
## 286                                 Teng Shen(1), Ma Li(0.6), Shi Pengyuan(0.3), Sa Rina(0.5), Xiao Bochen(0.1)
## 294                        Teng Shen(1), Yin Zheng(0.7), Zhang Benyu(0.2), Fan Chengcheng(0.7), Sun Yizhou(0.5)
## 302                               Teng Shen(1), Ma Li(0.6), Chang Yuan(0.3), Li Chengru(0.6), Cailun Huang(0.2)
## 313                                        Ge You(0.5), Huang Bo(1), Zhang Yi(0.7), Simon Yam(1), Song Jia(1.6)
## 323                   Wang Baoqiang(0.7), Liu Haoran(0.8), Chow Yun-Fat(1.9), White-K(0.6), Zhang Xincheng(1.8)
## 338                              James Hetfield(0.5), Lars Ulrich(0.2), Kirk Hammett(0.3), Robert Trujillo(0.2)
## 386                         Ma Li(0.6), Wei Xiang(0.5), Allen Ai Lun(0.5), Chen Minghao(0.4), Cailun Huang(0.2)
## 395                Bai Baihe(0.6), Jing Boran(0.8), Jiang Wu(0.6), Elaine Jin Yan-Ling(0.4), Wallace Chung(0.7)
## 415                      Teng Shen(1), Vivian Sung(0.5), Zhang Yiming(0.1), Morning Chang(0.4), Chang Yuan(0.3)
## 449                     Tony Leung(1.5), Bai Baihe(0.6), Jing Boran(0.8), Chris Lee Yuchun(0.5), Tony Yang(0.9)
## 455     Ryunosuke Kamiki(1.1), Mone Kamishiraishi(1.3), Ryo Narita(0.8), Aoi Yuki(1.6), Nobunaga Shimazaki(1.5)
## 498                                 Lei Jiayin(0.7), Ma Li(0.6), Gao Ye(0.7), Zhao Liying(5.9), Liu Yaowen(0.2)
## 502     Clarence Nash(0.5), Sterling Holloway(0.6), Joaquin Garay(0.1), José Oliveira(0.1), Aurora Miranda(0.2)
## 507                               Allen Ai Lun(0.5), Ma Li(0.6), Teng Shen(1), Xue Haowen(0.4), Chang Yuan(0.3)
## 530                Nanoka Hara(0.6), Hokuto Matsumura(0.7), Eri Fukatsu(1), Shota Sometani(0.7), Sairi Ito(0.9)
## 695            Shugo Nakamura(0.4), Jun Kasama(0.4), Kenta Miyake(1.3), Shinichiro Kamio(0.6), Subaru Kimura(1)
## 714                  Yang Tianxiang(0.4), Ling Zhenhe(0.2), Junquan Wu(0.1), Xuan Xiaoming(0.2), Lu Lifeng(0.1)
## 746                                Huang Xiaoming(1.2), Du Jiang(0.4), Tan Zhuo(0.5), Yang Zi(3.5), Ou Hao(3.7)
## 765                        Zheng Xi(0.1), Jiang Guangtao(0.5), Ji Guanlin(0.2), Sheng Feng(0.5), Yang Ning(0.3)
## 778                         Zhang Yi(0.7), Karry Wang(0.8), Yin Tao(0.9), Cheng Taishen(0.4), Zhang Zixian(0.5)
## 802 Lionel Stander(0.5), Rosanna Fratello(0.1), Michele Placido(0.7), Corrado Gaipa(0.2), Annie Carol Edel(0.2)
## 805                                Teng Shen(1), Ma Li(0.6), Allen Ai Lun(0.5), Yin Zheng(0.7), Chang Yuan(0.3)
## 820                                Zhou Dongyu(1), Jackson Yee(2.3), Yin Fang(0.4), Huang Jue(0.6), Wu Yue(0.4)
## 825                        Huang Xuan(0.9), Miao Miao(0.4), Elane Zhong(1.3), Yang Caiyu(0.4), Li Xiaofeng(0.3)
## 835                           Zhang Bingjun(0.1), Zhang Wei(0.1), Yingying Miu(0.1), Tan Xiao(0.2), 贾晨露(0.1)
## 863                           Jackson Yee(2.3), Liu Haocun(1.4), Gao Yalin(0.4), Zhu Yuanyuan(0.3), Yu Xia(0.7)
## 869                               Wen Zhang(0.4), Shu Qi(2.9), Huang Bo(1), Show Lo(0.5), Lee Sheung-Ching(0.2)
## 899                                   Jackson Yee(2.3), Tian Yu(0.4), Zhang Yu(0.6), Eric Wang(0.6), Qi Xi(0.6)
## 905                 Jing Boran(0.8), Zhou Dongyu(1), Tian Zhuangzhuang(0.3), Qu Zheming(0.2), Zhang Zixian(0.5)
## 928                  Donnie Yen(2.7), Nicholas Tse Ting-Fung(1.8), Qin Lan(1.4), Ray Lui(1.1), Patrick Tam(0.9)
## 952         Tilman Strauss(0.1), Thomas Schendel(0.1), Judith Rosmair(0.1), Niklas Kohrt(0.1), Ruby O. Fee(1.6)
## 957                                                                             Samuel Day(0), Henry Kieffer(0)
## 960                                                                                          Ralph Bellamy(0.7)
## 979       Megumi Ogata(1.7), Kana Hanazawa(2), Yuichi Nakamura(1.4), Takahiro Sakurai(2.3), Mikako Komatsu(1.4)
## 995                Eric Wang(0.6), Janine Chang(0.9), Francis Ng Chun-Yu(1.7), Wang Shengdi(0.7), Cai Ming(0.5)
##                                                                                                                           crews
## 56                                                                                                          Aidan Wainwright(0)
## 163                                           Zhang Yimou(0.6), Yiqun Jia(0.1), Li Yongyi(0), Zhao Xiaoding(0.1), Han Hong(0.1)
## 257                                                                                                                  DJANITA(0)
## 286                                                  Peng Damo(0.1), Yan Fei(0.1), Yan Fei(0.1), Peng Damo(0.1), Lin Bingbao(0)
## 294                                                         Han Han(0.3), Han Han(0.3), Li Jie(0.1), Lu Jia(0), Chen Zhixi(0.2)
## 302                                               Yuyue Shen(0.1), Chenguang Qian(0), Si'ao Dai(0), Zhang Chiyu(0), Zhang Li(0)
## 313                                               Faye Wong(0.6), Wang Yibing(0.1), Binxing Fu(0.1), Xiaobei Cao(0), Xin Liu(0)
## 323                                      Chen Sicheng(0.7), Chen Sicheng(0.7), Dai Mo(0.1), Nathan Wang(0.4), Chen Sicheng(0.7)
## 338                                                                                                 Lug Zajonc(0), Dan Braun(0)
## 386                                                                             Xing Wenxiong(0), Xing Wenxiong(0), Fei Peng(0)
## 395                                    Raman Hui Shing-Ngai(0.4), Alan Yuen(0.3), Alan Yuen(0.3), Hao Lee(0), Ho Yiu-Leung(0.1)
## 415                                                  Duan Aojuan(0.1), Meng Meiqi(0.2), Xu Mengjie(0.7), Yamy(0.1), Sunnee(0.1)
## 449                                     Su Liang(0.2), Hao Lee(0), Shadow Hung(0), Ho Yiu-Leung(0.1), Raman Hui Shing-Ngai(0.4)
## 455                      Masanori Yumiya(0), Shinichirou Inoue(0.1), Tatsuro Hatanaka(0.1), Junji Zenki(0.1), Ken Sakamoto(0.1)
## 498                                      Li Meng(0.1), Xiaobei Cao(0), Li Chuan-Long(0), Zhao Xiaoding(0.1), Chen Minzheng(0.1)
## 502                           Billy Daniel(0.1), Walt Disney(0.7), Donald Halliday(0.1), Norman Ferguson(0.2), Eric Larson(0.2)
## 507                                       Zhang Chiyu(0), Zhang Chiyu(0), Derek Hui Wang-Yu(0.2), Si'ao Dai(0), Yuyue Shen(0.1)
## 530                                     Yojiro Noda(0.5), Yojiro Noda(0.5), Kazuma Jinnouchi(0.2), Toaka(0), Yusuke Takeda(0.1)
## 695                       Takehiko Inoue(0.3), Yasuyuki Ebara(0.3), Haruka Kamatani(0), Ryuichi Takita(0.1), Yota Tsuruoka(0.5)
## 714                                                    Xie Junwei(0.1), Zou Jing(0), Gary Wang(0.2), Li Bai(0), Guo Haowei(0.1)
## 746                                   Peter Kam Pau-Tat(0.2), Shengji Liang(0), Yu Yonggan(0.1), Barfuss Hui(0), Tony Chan(0.2)
## 765                                        Li Ruijie(0.1), Jing Wang(0.1), Li Ruijie(0.1), Wei Yunyun(0.1), Wang Changtian(0.1)
## 778                                               Frant Gwo(0.8), Rao Xiaozhi(0.2), Faye Wong(0.6), Shi Ce(0), Lei Zhilong(0.1)
## 802                   Antonio Racioppi(0.1), Gino Capone(0.2), Carlo Rustichelli(0.1), Ignazio Dolce(0.2), Alessandro Fallai(0)
## 805                                       Derek Hui Wang-Yu(0.2), Yan Fei(0.1), Peng Damo(0.1), Zhou Xiaolin(0), Tian Tian(0.1)
## 820                                                    Gao Ke(0), Peter Chan(0.4), Lili Sun(0.1), Shujie Zong(0), Hong Qin(0.1)
## 825                                                      Yu Gong(0.8), Feng Xiaogang(0.6), Yan Geling(0.5), Luo Pan(0), 张琪(0)
## 835                                         Yongchang Lin(0.1), Heqi Shao(0), Yongchang Lin(0.1), Heqi Shao(0), Tiezhi Cui(0.1)
## 863                                              Wang Gang(0.1), Rui Zhong(0), Zhao Yingjun(0.2), Yan Han(0.4), Jiawei Jia(0.1)
## 869                            Zhang Dajun(0.1), Ivy Kong Yuk-Yee(0.2), Stephen Chow(3.2), Stephen Chow(3.2), Ellen Eliasoph(0)
## 899                                               Huang Chao(0), Wei Zhong(0.1), Mingyi Liu(0), Wang Yibing(0.1), Wen Muye(0.2)
## 905                                                   Yuan Yuan(0.2), Yu Pan(0), Dong Ping(0.2), Liu Rong(0), Zheng Zhihao(0.1)
## 928 Flora Au Wai-Ling(0.1), Nicolas Errèra(0.1), Tim Tong Yiu-Leung(0.1), Benny Chan Muk-Sing(0.6), Nicholas Tse Ting-Fung(1.8)
## 952                                                                 Carol Burandt von Kameke(0), Sven Halfar(0), Sven Halfar(0)
## 957                                                      Henry Kieffer(0), Henry Kieffer(0), Henry Kieffer(0), Henry Kieffer(0)
## 960                                                 David Miller(0.2), Art Arthur(0.1), Gene Fowler Jr.(0.1), David Miller(0.2)
## 979                   Gege Akutami(0.2), Katsuhiro Nakano(0.3), Alisa Okehazama(0.1), Tadashi Hiramatsu(0.4), Sunghoo Park(0.4)
## 995                                                      Boon-lip Quah(0.1), Boon-lip Quah(0.1), You Wenwei(0.1), Zhang Ying(0)
##     actor_avg_pop crew_avg_pop
## 56        0.00000     0.000000
## 163       1.02544     0.202460
## 257            NA     0.000000
## 286       0.51680     0.091820
## 294       0.63448     0.169860
## 302       0.53308     0.040080
## 313       0.96548     0.167260
## 323       1.16570     0.524720
## 338       0.27175     0.013050
## 386       0.44024     0.036300
## 395       0.63568     0.215140
## 415       0.46784     0.251380
## 449       0.86762     0.142740
## 455       1.26204     0.076240
## 498       1.59452     0.061680
## 502       0.28960     0.268240
## 507       0.56262     0.063580
## 530       0.79050     0.274200
## 695       0.74016     0.220360
## 714       0.20390     0.073980
## 746       1.84240     0.115940
## 765       0.32410     0.097660
## 778       0.67318     0.333360
## 802       0.33376     0.118660
## 805       0.63026     0.099260
## 820       0.92132     0.117140
## 825       0.65676     0.408740
## 835       0.09998     0.081580
## 863       1.00314     0.165580
## 869       1.00910     1.343680
## 899       0.89386     0.080240
## 905       0.54952     0.097180
## 928       1.56860     0.516760
## 952       0.41146     0.025500
## 957       0.00000     0.000000
## 960       0.70960     0.152525
## 979       1.77070     0.270960
## 995       0.86034     0.086000

0 runtime movies

zero_runtime <- movies$runtime == 0
movies[zero_runtime,]
##     movie_id                                   title   revenue popularity
## 231  1454250                                 Salada. 541441574     0.0071
## 258  1432484 The World of the Pharaoh (Live Theatre) 500000000     0.0338
##      budget release_date runtime genres                                  actors
## 231      40   2025-03-25       0  Crime Matheus Ribeiro(0), Salada(0), Garfo(0)
## 258 1000000   2025-02-09       0                                Martín Cirio(0)
##                                  crews actor_avg_pop crew_avg_pop
## 231 Pedro Augusto(0), Pedro Augusto(0)   0.006433333      0.00355
## 258                    Martín Cirio(0)   0.032700000      0.03270

Remove zero values

movies <- filter(movies, budget > 0 & runtime > 0)

Movies without actor

actors_na <- is.na(movies$actors)
movies[actors_na,]
##     movie_id            title    revenue popularity  budget release_date
## 55   1454531 KUCHAR x CLINTON 1000000000     0.0071 1000000         <NA>
## 920  1148594              ere  200202022     0.1475 1209097         <NA>
##     runtime genres actors               crews actor_avg_pop crew_avg_pop
## 55       41          <NA> Derwood Shmedley(0)            NA            0
## 920     120          <NA>                <NA>            NA           NA

Movies without crew

crews_na <- is.na(movies$crews)
movies[crews_na,]
##     movie_id                                 title   revenue popularity  budget
## 364  1425681 Lyna Live From Buenos Aires: Gran Rex 400000000     0.0214 1000000
## 920  1148594                                   ere 200202022     0.1475 1209097
##     release_date runtime       genres  actors crews actor_avg_pop crew_avg_pop
## 364   2023-01-01      90 Family,Music Lyna(0)  <NA>        0.0071           NA
## 920         <NA>     120                 <NA>  <NA>            NA           NA

Movies without release date

release_date_na <- is.na(movies$release_date)
movies[release_date_na,]
##     movie_id            title    revenue popularity  budget release_date
## 55   1454531 KUCHAR x CLINTON 1000000000     0.0071 1000000         <NA>
## 920  1148594              ere  200202022     0.1475 1209097         <NA>
## 921  1433493         Deerfeat  200000000     0.0214     400         <NA>
## 922  1034806         The Pint  200000000     0.0264       3         <NA>
##     runtime                genres
## 55       41                      
## 920     120                      
## 921       5 Action,Horror,Mystery
## 922       5                      
##                                                                                              actors
## 55                                                                                             <NA>
## 920                                                                                            <NA>
## 921                                                            Justice Gondek(0), Justice Gondek(0)
## 922 Joseph Glenn(0), Iain Bayfield(0.1), Callum White(0), Henry Swindells(0), Uilleam MacDougall(0)
##                   crews actor_avg_pop crew_avg_pop
## 55  Derwood Shmedley(0)            NA       0.0000
## 920                <NA>            NA           NA
## 921      Todd Gondek(0)       0.01430       0.0000
## 922      Zoey Thomas(0)       0.02908       0.0143

Remove NA values

movies <- filter(movies, !is.na(release_date) & !is.na(actors) & !is.na(crews))
colSums(is.na(movies))
##      movie_id         title       revenue    popularity        budget 
##             0             0             0             0             0 
##  release_date       runtime        genres        actors         crews 
##             0             0             0             0             0 
## actor_avg_pop  crew_avg_pop 
##             0             0

Recheck the values

cat("Summary for budget:\n")
## Summary for budget:
summary(movies$budget)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     50000  50000000  88000000 100782583 150000000 460000000
cat("Summary for runtime:\n")
## Summary for runtime:
summary(movies$runtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0   103.0   116.0   118.9   131.5   233.0

Remove movies with an out-of-place runtime

movies <- filter(movies, runtime != 4)
summary(movies$runtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    70.0   103.0   116.5   119.0   131.8   233.0

Check movies released in 2025

movies[movies$release_year == 2025,]
##  [1] movie_id      title         revenue       popularity    budget       
##  [6] release_date  runtime       genres        actors        crews        
## [11] actor_avg_pop crew_avg_pop 
## <0 rows> (or 0-length row.names)
# Extract year from release date
movies$release_year <- year(movies$release_date)
movies <- filter(movies, release_year != 2025)

Since there are only two movies from 2025, we’re going to remove movies from 2025 so that our analysis is not biased

Pre-processing data

Convert release_date to season

movies$season <- as.factor(case_when(
  format(movies$release_date, "%m") %in% c("12", "01", "02") ~ "Winter",
  format(movies$release_date, "%m") %in% c("03", "04", "05") ~ "Spring",
  format(movies$release_date, "%m") %in% c("06", "07", "08") ~ "Summer",
  format(movies$release_date, "%m") %in% c("09", "10", "11") ~ "Fall",
  TRUE ~ "Unknown"
))
levels(movies$season)
## [1] "Fall"   "Spring" "Summer" "Winter"

Convert genres into one-hot encoding

movies <- movies |>
  separate_rows(genres, sep = ",") |>
  mutate(value = 1) |>
  pivot_wider(names_from = genres, values_from = value, values_fill = list(value = 0))

# Take only movies from 1947 since the data for CPI starts from 1947
cpi <- read.csv("CPIAUCSL.csv")
cpi$observation_month <- as.numeric(substring(cpi$observation_date, 6, 7))
cpi$observation_year <- as.numeric(substring(cpi$observation_date, 1, 4))
movies$release_month <- as.numeric(substring(movies$release_date, 6, 7))
movies <- filter(movies, release_year >= 1946) |>
  left_join(cpi, by = c("release_year" = "observation_year", "release_month" = "observation_month"))
# Check if there are any missing values after the join
missing_values <- movies[!complete.cases(movies), ]
missing_values
## # A tibble: 0 × 34
## # ℹ 34 variables: movie_id <int>, title <chr>, revenue <dbl>, popularity <dbl>,
## #   budget <int>, release_date <date>, runtime <int>, actors <chr>,
## #   crews <chr>, actor_avg_pop <dbl>, crew_avg_pop <dbl>, release_year <dbl>,
## #   season <fct>, Action <dbl>, Adventure <dbl>, Fantasy <dbl>,
## #   Science Fiction <dbl>, Drama <dbl>, Romance <dbl>, Animation <dbl>,
## #   Comedy <dbl>, Family <dbl>, Thriller <dbl>, Crime <dbl>, History <dbl>,
## #   Music <dbl>, War <dbl>, Mystery <dbl>, Horror <dbl>, Western <dbl>, …

Budget time series before accounting for inflation

# Compute average revenue per year
yearly_budget <- movies |>
  group_by(release_year) |>
  summarise(avg_budget = mean(budget, na.rm = TRUE), movie_count = n()) |>
  arrange(release_year)

# Plot time series
ggplot(yearly_budget, aes(x = release_year, y = avg_budget)) +
  geom_line(color = "#e74c3c", size = 1.5) +  # Bold red line
  geom_point(color = "#2c3e50", size = 3) +  # Dark gray dots
  theme_minimal() +
  labs(
    title = "Average Movie Budget Over Time",
    x = "Release Year",
    y = "Average Budget ($)"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  # Format revenue as dollars
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Adjust inflation for budget (can’t do for revenue since it mostly includes re-releases)

# Get largest release date to convert to
latest_release_date <- max(movies$release_date, na.rm = TRUE)
latest_release_month <- as.numeric(substring(latest_release_date, 6, 7))
latest_release_year <- as.numeric(substring(latest_release_date, 1, 4))
# Adjust budget for inflation
cpi_latest <- filter(cpi, observation_year == latest_release_year & observation_month == latest_release_month)$CPIAUCSL
movies <- movies |>
  mutate(budget = budget * (cpi_latest / CPIAUCSL))

# Replot the time series for budget
yearly_budget <- movies |>
  group_by(release_year) |>
  summarise(avg_budget = mean(budget, na.rm = TRUE), movie_count = n()) |>
  arrange(release_year)

# Plot time series
ggplot(yearly_budget, aes(x = release_year, y = avg_budget)) +
  geom_line(color = "#e74c3c", size = 1.5) +  # Bold red line
  geom_point(color = "#2c3e50", size = 3) +  # Dark gray dots
  theme_minimal() +
  labs(
    title = "Average Movie Budget Over Time",
    x = "Release Year",
    y = "Average Budget ($)"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  # Format revenue as dollars
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )

Store the cleaned dataset

store_movies <- movies
store_movies[] <- lapply(movies, function(x) {
  if (is.character(x)) Encoding(x) <- "UTF-8"
  return(x)
})
saveRDS(store_movies, file = "movies_cleaned.rds")

Exploratory data analysis (EDA)

Numerical summaries

cat("Summary statistics for key numeric or date variables:\n")
## Summary statistics for key numeric or date variables:
summary(select(movies, revenue, popularity, budget, release_date, runtime, actor_avg_pop, crew_avg_pop))
##     revenue            popularity           budget         
##  Min.   :1.915e+08   Min.   :  0.3816   Min.   :   114314  
##  1st Qu.:2.457e+08   1st Qu.:  7.6284   1st Qu.: 79038787  
##  Median :3.390e+08   Median : 11.2986   Median :132731340  
##  Mean   :4.453e+08   Mean   : 14.8793   Mean   :145774730  
##  3rd Qu.:5.242e+08   3rd Qu.: 16.9450   3rd Qu.:207913124  
##  Max.   :2.924e+09   Max.   :147.0271   Max.   :535446283  
##   release_date           runtime      actor_avg_pop       crew_avg_pop    
##  Min.   :1950-02-22   Min.   : 72.0   Min.   : 0.09326   Min.   :0.04116  
##  1st Qu.:2002-11-02   1st Qu.:103.0   1st Qu.: 1.95674   1st Qu.:0.19500  
##  Median :2011-04-11   Median :116.5   Median : 2.91181   Median :0.31440  
##  Mean   :2009-02-20   Mean   :119.0   Mean   : 3.32663   Mean   :0.48676  
##  3rd Qu.:2016-12-28   3rd Qu.:131.2   3rd Qu.: 4.21701   3rd Qu.:0.55189  
##  Max.   :2024-12-19   Max.   :224.0   Max.   :12.08894   Max.   :4.61122
cat("\nCount of movies for each season:")
## 
## Count of movies for each season:
table(movies$season)
## 
##   Fall Spring Summer Winter 
##    215    227    302    204

Response variable (revenue)

# Original scale
ggplot(movies, aes(x = revenue)) +
  geom_histogram(bins = 30, fill = "#3498db", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Distribution of Movie Revenue", 
       x = "Revenue", 
       y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

# Log scale
ggplot(movies, aes(x = log10(revenue))) +
  geom_histogram(bins = 30, fill = "#3498db", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Distribution of Movie Revenue (Log Scale)", 
       x = "Log10(Revenue)", 
       y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

Revenue vs. Budget

ggplot(movies, aes(x = budget, y = revenue)) +
  geom_point(alpha = 0.4, color = "#e74c3c") +
  geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Budget vs. Revenue", 
       x = "Budget", 
       y = "Revenue") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

ggplot(movies, aes(x = log10(budget), y = log10(revenue))) +
  geom_point(alpha = 0.4, color = "#e74c3c") +
  geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Budget vs. Revenue", 
       x = "Log10(Budget)", 
       y = "Log10(Revenue)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

Revenue vs. Popularity

## Revenue vs. Popularity
ggplot(movies, aes(x = popularity, y = revenue)) +
  geom_point(alpha = 0.5, color = "#3498db") +
  geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Movie Popularity vs. Revenue",
    x = "TMDb Popularity Score",
    y = "Revenue"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

ggplot(movies, aes(x = popularity, y = log10(revenue))) +
  geom_point(alpha = 0.5, color = "#3498db") +
  geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Movie Popularity vs. Revenue",
    x = "TMDb Popularity Score",
    y = "Log10(Revenue)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

Revenue vs. Runtime

ggplot(movies, aes(x = runtime, y = log10(revenue))) +
  geom_point(alpha = 0.6, color = "#E74C3C") +  # Vibrant Coral Red
  geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Movie Runtime vs. Revenue",
       x = "Runtime (minutes)",
       y = "Log10(Revenue)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 12),  
    axis.title = element_text(size = 13, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

Revenue vs. Release Season

ggplot(movies, aes(x = season, y = log10(revenue), fill = season)) +
  geom_boxplot(alpha = 0.85, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
  scale_fill_manual(values = c("Winter" = "#74b9ff",
                               "Spring" = "#00b894",
                               "Summer" = "#f9ca24",
                               "Fall" = "#d35400")) +
  theme_minimal() +
  labs(title = "Movie Revenue by Release Season", 
       x = "Season", 
       y = "Log10(Revenue)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "none",
    axis.text.x = element_text(face = "bold", size = 12)
  )

Revenue vs. Release Year

# Compute average revenue per year
yearly_revenue <- movies |>
  group_by(release_year) |>
  summarise(avg_revenue = mean(revenue, na.rm = TRUE), movie_count = n()) |>
  arrange(release_year)

# Plot time series
ggplot(yearly_revenue, aes(x = release_year, y = avg_revenue)) +
  geom_line(color = "#e74c3c", size = 1.5) +  # Bold red line
  geom_point(color = "#2c3e50", size = 3) +  # Dark gray dots
  theme_minimal() +
  labs(
    title = "Average Movie Revenue Over Time",
    x = "Release Year",
    y = "Average Revenue ($)"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  # Format revenue as dollars
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )

Revenue vs. Genres

# Define colors based on genre themes
genre_colors <- c(
  "Action" = "#e74c3c",      
  "Adventure" = "#f39c12",   
  "Animation" = "#f1c40f",      
  "Comedy" = "#2ecc71",       
  "Crime" = "#34495e",        
  "Documentary" = "#95a5a6",    
  "Drama" = "#9b59b6",     
  "Family" = "#e67e22",    
  "Fantasy" = "#8e44ad",       
  "History" = "#d35400",       
  "Horror" = "#c0392b",     
  "Music" = "#1abc9c",     
  "Mystery" = "#16a085",       
  "Romance" = "#e84393",      
  "Science Fiction" = "#27ae60",
  "TV Movie" = "#7f8c8d",      
  "Thriller" = "#2c3e50",     
  "War" = "#bdc3c7",            
  "Western" = "#8d6e63"
)

# Prepare genre-specific revenue data
genre_revenue <- movies |>
  select(starts_with("Action"):starts_with("Documentary"), revenue) |>
  pivot_longer(cols = starts_with("Action"):starts_with("Documentary"), names_to = "Genre", values_to = "Presence") |>
  filter(Presence == 1)

# Boxplot of revenue by genre
ggplot(genre_revenue, aes(x = reorder(Genre, log10(revenue), FUN = median), y = log10(revenue), fill = Genre)) +
  geom_boxplot(alpha = 0.85, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
  scale_fill_manual(values = genre_colors) +
  coord_flip() +  # Horizontal layout for readability
  theme_minimal() +
  labs(title = "Revenue Distribution by Genre",
       x = "Genre",
       y = "Log10(Revenue)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "none",
    axis.text.x = element_text(face = "bold", size = 12),
    axis.text.y = element_text(face = "bold", size = 10)
  )

highest_history_movie <- movies |>
  filter(History == 1) |>
  arrange(desc(revenue)) |>
  slice_max(revenue, n = 1) 

highest_history_movie
## # A tibble: 1 × 34
##   movie_id title     revenue popularity budget release_date runtime actors crews
##      <int> <chr>       <dbl>      <dbl>  <dbl> <date>         <int> <chr>  <chr>
## 1   872585 Oppenhei…  9.52e8       28.0 1.04e8 2023-07-19       181 Cilli… Andr…
## # ℹ 25 more variables: actor_avg_pop <dbl>, crew_avg_pop <dbl>,
## #   release_year <dbl>, season <fct>, Action <dbl>, Adventure <dbl>,
## #   Fantasy <dbl>, `Science Fiction` <dbl>, Drama <dbl>, Romance <dbl>,
## #   Animation <dbl>, Comedy <dbl>, Family <dbl>, Thriller <dbl>, Crime <dbl>,
## #   History <dbl>, Music <dbl>, War <dbl>, Mystery <dbl>, Horror <dbl>,
## #   Western <dbl>, Documentary <dbl>, release_month <dbl>,
## #   observation_date <chr>, CPIAUCSL <dbl>

Revenue vs. Actor’s popularity

ggplot(movies, aes(x = actor_avg_pop, y = log10(revenue))) +
  geom_point(alpha = 0.5, color = "#9b59b6") +
  geom_smooth(method = "lm", col = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Actor Popularity vs. Revenue", 
       x = "Average Actor Popularity", 
       y = "Log10(Revenue)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'

Revenue vs. Crew’s popularity

ggplot(movies, aes(x = crew_avg_pop, y = log10(revenue))) +
  geom_point(alpha = 0.5, color = "#e74c3c") +
  geom_smooth(method = "lm", col = "#2c3e50", se = FALSE, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Crew Popularity vs. Revenue",
       x = "Average Crew Popularity",
       y = "Log10(Revenue)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

Top actors with highest average revenue, total revenue, and movie count

# Extract actor names without popularity score
top_actors_total <- movies |>
  separate_rows(actors, sep = ", ") |>
  mutate(actor_name = str_extract(actors, "^[^(]+")) |>  # Extract only the name, ignoring numbers
  group_by(actor_name) |>
  summarise(total_revenue = sum(revenue, na.rm = TRUE)) |>
  arrange(desc(total_revenue)) |>
  slice_max(total_revenue, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_actors_total, aes(x = reorder(actor_name, total_revenue), y = total_revenue, fill = total_revenue)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#F5B7B1", high = "#922B21") +  # Light Pink → Deep Red
  theme_minimal() +
  labs(
    title = "Top 20 Actors by Total Movie Revenue",
    x = "Actor",
    y = "Total Revenue ($)",
    fill = "Total Revenue"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#641E16"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

# Extract top actors by average revenue per movie
top_actors_avg <- movies |>
  separate_rows(actors, sep = ", ") |>
  mutate(actor_name = str_extract(actors, "^[^(]+")) |>  
  group_by(actor_name) |>
  summarise(avg_revenue = mean(revenue, na.rm = TRUE)) |>
  arrange(desc(avg_revenue)) |>
  slice_max(avg_revenue, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_actors_avg, aes(x = reorder(actor_name, avg_revenue), y = avg_revenue, fill = avg_revenue)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#F8DE7E", high = "#D4AC0D") +  # Light Gold → Deep Gold
  theme_minimal() +
  labs(
    title = "Top 20 Actors by Average Movie Revenue",
    x = "Actor",
    y = "Average Revenue per Movie ($)",
    fill = "Avg Revenue"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#7D6608"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

# Extract top actors by movie count
top_actor_counts <- movies |>
  separate_rows(actors, sep = ", ") |>
  mutate(actor_name = str_extract(actors, "^[^(]+")) |>  
  group_by(actor_name) |>
  summarise(movie_count = n()) |>
  arrange(desc(movie_count)) |>
  slice_max(movie_count, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_actor_counts, aes(x = reorder(actor_name, movie_count), y = movie_count, fill = movie_count)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#FAD7A0", high = "#D35400") +  # Light Orange → Deep Orange
  theme_minimal() +
  labs(
    title = "Top 20 Actors by Number of Movies",
    x = "Actor",
    y = "Movie Count",
    fill = "Movie Count"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#B34700"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

Top crews with highest average revenue, total revenue, and movie count

# Extract top crew members by total revenue
top_crew_total <- movies |>
  separate_rows(crews, sep = ", ") |>
  mutate(crew_name = str_extract(crews, "^[^(]+")) |>  
  group_by(crew_name) |>
  summarise(total_revenue = sum(revenue, na.rm = TRUE)) |>
  arrange(desc(total_revenue)) |>
  slice_max(total_revenue, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_crew_total, aes(x = reorder(crew_name, total_revenue), y = total_revenue, fill = total_revenue)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#A2D9CE", high = "#117A65") +  # Light Teal → Deep Green
  theme_minimal() +
  labs(
    title = "Top 20 Crew Members by Total Movie Revenue",
    x = "Crew Member",
    y = "Total Revenue ($)",
    fill = "Total Revenue"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#154360"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

# Extract top crew members by average revenue per movie
top_crew_avg <- movies |>
  separate_rows(crews, sep = ", ") |>
  mutate(crew_name = str_extract(crews, "^[^(]+")) |>  
  group_by(crew_name) |>
  summarise(avg_revenue = mean(revenue, na.rm = TRUE)) |>
  arrange(desc(avg_revenue)) |>
  slice_max(avg_revenue, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_crew_avg, aes(x = reorder(crew_name, avg_revenue), y = avg_revenue, fill = avg_revenue)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#85C1E9", high = "#1B4F72") +  # Light Blue → Deep Blue
  theme_minimal() +
  labs(
    title = "Top 20 Crew Members by Average Movie Revenue",
    x = "Crew Member",
    y = "Average Revenue per Movie ($)",
    fill = "Avg Revenue"
  ) +
  scale_y_continuous(labels = scales::dollar_format()) +  
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#154360"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

# Extract top crew members by movie count
top_crew_counts <- movies |>
  separate_rows(crews, sep = ", ") |>
  mutate(crew_name = str_extract(crews, "^[^(]+")) |>  
  group_by(crew_name) |>
  summarise(movie_count = n()) |>
  arrange(desc(movie_count)) |>
  slice_max(movie_count, n = 20, with_ties = FALSE)  

# Plot
ggplot(top_crew_counts, aes(x = reorder(crew_name, movie_count), y = movie_count, fill = movie_count)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = "#D7BDE2", high = "#6C3483") +  # Light Purple → Deep Violet
  theme_minimal() +
  labs(
    title = "Top 20 Crew Members by Number of Movies",
    x = "Crew Member",
    y = "Movie Count",
    fill = "Movie Count"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#512E5F"),
    axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12, face = "bold")
  )

Correlation heatmap of key numeric variables

# Select only numerical columns
numeric_vars <- select(movies, revenue, popularity, budget, runtime, actor_avg_pop, crew_avg_pop)

# Compute correlation matrix
corr_matrix <- cor(numeric_vars, use = "complete.obs")

# Define a custom theme for better aesthetics
ggcorrplot(corr_matrix, method = "square",
           colors = c("#1f77b4", "white", "#d62728"),  # Blue (negative) → White (neutral) → Red (positive)
           lab = TRUE,  # Show correlation values
           lab_size = 5,  # Larger labels
           outline.color = "black",  # Borders for better contrast
           ggtheme = theme_minimal()) +  # Minimalist theme
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),  # Elegant title
    panel.grid.major = element_blank(),  # Remove grid lines for a clean look
    panel.border = element_blank(),  # No outer border
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1, color = "#2c3e50", face = "bold"),  # Rotated x-labels
    axis.text.y = element_text(size = 12, color = "#2c3e50", face = "bold")  # Bold y-labels
  ) +
  labs(title = "Correlation Heatmap: Movie Revenue vs Predictors")

Some statistical analysis

Linear relationship analysis for numeric predictors

lm_model <- lm(log10(revenue) ~ log10(budget) + popularity + runtime + actor_avg_pop + crew_avg_pop, data = movies)

summary(lm_model)
## 
## Call:
## lm(formula = log10(revenue) ~ log10(budget) + popularity + runtime + 
##     actor_avg_pop + crew_avg_pop, data = movies)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54952 -0.13534 -0.02981  0.12426  0.66769 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.5760463  0.1440520  45.650  < 2e-16 ***
## log10(budget)  0.2220063  0.0184891  12.007  < 2e-16 ***
## popularity     0.0050809  0.0004400  11.547  < 2e-16 ***
## runtime        0.0010715  0.0002969   3.609 0.000323 ***
## actor_avg_pop  0.0037404  0.0034444   1.086 0.277782    
## crew_avg_pop  -0.0033722  0.0112840  -0.299 0.765119    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.191 on 942 degrees of freedom
## Multiple R-squared:  0.3103, Adjusted R-squared:  0.3067 
## F-statistic: 84.78 on 5 and 942 DF,  p-value: < 2.2e-16

Linear relationship analysis for each genre

lm_model <- lm(
  log10(revenue) ~ Action + Adventure + Fantasy + `Science Fiction` + Drama + Romance + 
    Animation + Comedy + Family + Thriller + Crime + History + Music + War + 
    Mystery + Horror + Western + Documentary, 
  data = movies
)

summary(lm_model)
## 
## Call:
## lm(formula = log10(revenue) ~ Action + Adventure + Fantasy + 
##     `Science Fiction` + Drama + Romance + Animation + Comedy + 
##     Family + Thriller + Crime + History + Music + War + Mystery + 
##     Horror + Western + Documentary, data = movies)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45437 -0.14608 -0.03736  0.12388  0.83213 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.553324   0.025006 342.056  < 2e-16 ***
## Action             0.018428   0.018455   0.999  0.31829    
## Adventure          0.095560   0.017211   5.552 3.68e-08 ***
## Fantasy            0.034539   0.019141   1.804  0.07149 .  
## `Science Fiction`  0.047815   0.019700   2.427  0.01541 *  
## Drama             -0.026308   0.019756  -1.332  0.18331    
## Romance           -0.004239   0.024257  -0.175  0.86133    
## Animation          0.078728   0.028879   2.726  0.00653 ** 
## Comedy            -0.075516   0.018622  -4.055 5.43e-05 ***
## Family            -0.022729   0.027720  -0.820  0.41245    
## Thriller          -0.032745   0.020138  -1.626  0.10428    
## Crime             -0.035385   0.024929  -1.419  0.15611    
## History           -0.087570   0.040227  -2.177  0.02974 *  
## Music             -0.005852   0.046740  -0.125  0.90039    
## War                0.015510   0.042627   0.364  0.71605    
## Mystery           -0.031443   0.031308  -1.004  0.31548    
## Horror            -0.093029   0.034768  -2.676  0.00759 ** 
## Western           -0.113331   0.080746  -1.404  0.16078    
## Documentary       -0.168344   0.151804  -1.109  0.26773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2098 on 929 degrees of freedom
## Multiple R-squared:  0.1793, Adjusted R-squared:  0.1634 
## F-statistic: 11.28 on 18 and 929 DF,  p-value: < 2.2e-16

ANOVA for categorical predictors

anova_model <- aov(log10(revenue) ~ season, data = movies)
summary(anova_model)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## season        3   0.93 0.30925   5.969 0.000496 ***
## Residuals   944  48.91 0.05181                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Build Random Forest model

Verify variable names and types

str(movies)
## tibble [948 × 34] (S3: tbl_df/tbl/data.frame)
##  $ movie_id        : int [1:948] 19995 299534 76600 597 140607 299536 634649 1022789 135397 420818 ...
##  $ title           : chr [1:948] "Avatar" "Avengers: Endgame" "Avatar: The Way of Water" "Titanic" ...
##  $ revenue         : num [1:948] 2.92e+09 2.80e+09 2.32e+09 2.26e+09 2.07e+09 ...
##  $ popularity      : num [1:948] 27.7 63.2 39.5 44.4 16.4 ...
##  $ budget          : num [1:948] 3.46e+08 4.43e+08 4.89e+08 3.93e+08 3.27e+08 ...
##  $ release_date    : Date[1:948], format: "2009-12-15" "2019-04-24" ...
##  $ runtime         : int [1:948] 162 181 192 194 136 149 148 97 124 118 ...
##  $ actors          : chr [1:948] "Sam Worthington(2.8), Zoe Saldaña(9), Sigourney Weaver(3.6), Stephen Lang(1.7), Michelle Rodriguez(4.6)" "Robert Downey Jr.(11.1), Chris Evans(7.5), Mark Ruffalo(3.4), Chris Hemsworth(13.5), Scarlett Johansson(20)" "Sam Worthington(2.8), Zoe Saldaña(9), Sigourney Weaver(3.6), Stephen Lang(1.7), Kate Winslet(6.9)" "Leonardo DiCaprio(8.8), Kate Winslet(6.9), Billy Zane(1.7), Kathy Bates(2.3), Frances Fisher(2.1)" ...
##  $ crews           : chr [1:948] "James Cameron(2.9), James Cameron(2.9), Ilram Choi(0.1), Woody Schultz(0.1), Mauro Fiore(0.1)" "Paul Schneider(0.1), Louis D'Esposito(0.5), Carlos Pacheco(0.1), John David Duncan(0.1), Christopher Kelly(0.2)" "James Cameron(2.9), James Cameron(2.9), James Cameron(2.9), Michele Perry(0.1), Carly Marr(0)" "Martin Laing(0.1), Russell Carpenter(0.2), James Cameron(2.9), Giedra Rackauskas(0), Peter Lamont(0.2)" ...
##  $ actor_avg_pop   : num [1:948] 4.33 11.11 4.8 4.35 2.77 ...
##  $ crew_avg_pop    : num [1:948] 1.231 0.198 1.777 0.691 0.904 ...
##  $ release_year    : num [1:948] 2009 2019 2022 1997 2015 ...
##  $ season          : Factor w/ 4 levels "Fall","Spring",..: 4 2 4 1 4 2 4 3 3 3 ...
##  $ Action          : num [1:948] 1 1 1 0 1 1 1 0 1 0 ...
##  $ Adventure       : num [1:948] 1 1 1 0 1 1 1 1 1 1 ...
##  $ Fantasy         : num [1:948] 1 0 0 0 0 0 0 0 0 0 ...
##  $ Science Fiction : num [1:948] 1 1 1 0 1 1 1 0 1 0 ...
##  $ Drama           : num [1:948] 0 0 0 1 0 0 0 0 0 1 ...
##  $ Romance         : num [1:948] 0 0 0 1 0 0 0 0 0 0 ...
##  $ Animation       : num [1:948] 0 0 0 0 0 0 0 1 0 1 ...
##  $ Comedy          : num [1:948] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Family          : num [1:948] 0 0 0 0 0 0 0 1 0 1 ...
##  $ Thriller        : num [1:948] 0 0 0 0 0 0 0 0 1 0 ...
##  $ Crime           : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ History         : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Music           : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ War             : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Mystery         : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Horror          : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Western         : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Documentary     : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
##  $ release_month   : num [1:948] 12 4 12 11 12 4 12 6 6 7 ...
##  $ observation_date: chr [1:948] "2009-12-01" "2019-04-01" "2022-12-01" "1997-11-01" ...
##  $ CPIAUCSL        : num [1:948] 217 255 299 162 238 ...

Preprocess data for Random Forest

rf_data <- movies |>
  mutate(
    log_revenue = log10(revenue),
    log_budget = log10(budget),
  ) |>
  rename(Science_Fiction = `Science Fiction`) |>
  drop_na()  # Ensure no missing values

n <- nrow(rf_data)
train_idx <- sample(1:n, size = 0.8 * n)
train <- rf_data[train_idx, ]
test <- rf_data[-train_idx, ]

Train Random Forest model

rf_model <- randomForest(
  log_revenue ~ 
    log_budget + popularity + runtime + actor_avg_pop + crew_avg_pop + season +
    Adventure + Fantasy + Science_Fiction + Animation + Action + Family + War +
    Thriller + Comedy + Romance + Mystery + Drama + Music + Crime + Horror +
    Western + Documentary + History,
  data = train,
  ntree = 500,
  importance = TRUE
)
print(rf_model)
## 
## Call:
##  randomForest(formula = log_revenue ~ log_budget + popularity +      runtime + actor_avg_pop + crew_avg_pop + season + Adventure +      Fantasy + Science_Fiction + Animation + Action + Family +      War + Thriller + Comedy + Romance + Mystery + Drama + Music +      Crime + Horror + Western + Documentary + History, data = train,      ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##           Mean of squared residuals: 0.03224818
##                     % Var explained: 41.25

Evaluate model

pred <- predict(rf_model, newdata = test)
true <- test$log_revenue

r_squared <- 1 - sum((pred - true)^2) / sum((true - mean(true))^2)
rmse <- sqrt(mean((pred - true)^2))

cat("R-squared:", round(r_squared, 3), "\n")
## R-squared: 0.446
cat("RMSE:", round(rmse, 3), "\n")
## RMSE: 0.152
varImpPlot(rf_model, main = "Random Forest Feature Importance")

Function to predict revenue on original scale using Random Forest

rf_predict <- function(newdata) {
  log_prediction <- predict(rf_model, newdata)
  10^log_prediction # Convert back to original scale
}

Function to plot predictions

evaluate_model_predictions <- function(model, test_data, title_col = "title", revenue_col = "revenue") {
  test <- test_data %>%
    mutate(
      predicted_log_revenue = predict(model, newdata = test_data),
      actual_log_revenue = log10(.data[[revenue_col]]),
      residual = actual_log_revenue - predicted_log_revenue
    )
  
  top100 <- test %>%
    arrange(desc(.data[[revenue_col]])) %>%
    slice_head(n = 100) %>%
    mutate(
      title = fct_reorder(.data[[title_col]], actual_log_revenue),
      label_actual = paste0("Title: ", .data[[title_col]], 
                            "\nActual log-revenue: ", round(actual_log_revenue, 2)),
      label_pred = paste0("Title: ", .data[[title_col]], 
                          "\nPredicted log-revenue: ", round(predicted_log_revenue, 2))
    )
  
  return(top100)
}

plot_prediction_comparison <- function(top100_df) {
  gg <- ggplot(top100_df, aes(x = title)) +
    geom_point(aes(y = actual_log_revenue, text = label_actual), color = "black") +
    geom_point(aes(y = predicted_log_revenue, text = label_pred), color = "red") +
    geom_segment(aes(y = predicted_log_revenue, yend = actual_log_revenue, xend = title), 
                 color = "gray", linetype = "dashed") +
    coord_flip() +
    theme_minimal() +
    labs(
      title = "Actual vs Predicted Log Revenue (Top 100 Grossing Films)",
      x = "Movie Title",
      y = "Log10(Revenue)"
    )
  
  ggplotly(gg, tooltip = "text", height = 1500, width = 900)
}
top100_rf <- evaluate_model_predictions(rf_model, test)
plot_prediction_comparison(top100_rf)
## Warning in geom_point(aes(y = actual_log_revenue, text = label_actual), :
## Ignoring unknown aesthetics: text
## Warning in geom_point(aes(y = predicted_log_revenue, text = label_pred), :
## Ignoring unknown aesthetics: text

Build XGBoost model

Preprocess data for XGBoost

xgb_data <- movies %>%
  mutate(
    log_budget = log10(budget),
    log_revenue = log10(revenue)
  ) %>%
  select(
    -c(movie_id, title, release_date, actors, crews, budget, revenue,
       release_month, release_year, observation_date, CPIAUCSL)
  ) %>%
  drop_na()

# Convert categorical/factor to dummies (season and genres)
xgb_matrix <- model.matrix(~ . - log_revenue, data = xgb_data)[, -1]  # remove intercept

# Labels
xgb_labels <- xgb_data$log_revenue

# Train/test split
set.seed(42)
n <- nrow(xgb_matrix)
train_idx <- sample(1:n, size = 0.8 * n)
xgb_train <- xgb_matrix[train_idx, ]
xgb_test <- xgb_matrix[-train_idx, ]
y_train <- xgb_labels[train_idx]
y_test <- xgb_labels[-train_idx]

Train XGBoost model

xgb_model <- xgboost(
  data = xgb_train,
  label = y_train,
  nrounds = 100,
  objective = "reg:squarederror",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8,
  colsample_bytree = 0.8,
  verbose = 0
)

Evaluate model

xgb_pred <- predict(xgb_model, newdata = xgb_test)

r_squared_xgb <- 1 - sum((xgb_pred - y_test)^2) / sum((y_test - mean(y_test))^2)
rmse_xgb <- sqrt(mean((xgb_pred - y_test)^2))

cat("XGBoost R-squared:", round(r_squared_xgb, 3), "\n")
## XGBoost R-squared: 0.414
cat("XGBoost RMSE:", round(rmse_xgb, 3), "\n")
## XGBoost RMSE: 0.157

Feature Importance Plot

importance <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix = importance, top_n = 20)

## Function to predict revenue on original scale using XGBoost

predict_movie_revenue_xgb <- function(newdata_matrix) {
  log_pred <- predict(xgb_model, newdata = newdata_matrix)
  10^log_pred
}
evaluate_model_predictions_xgb <- function(model, test_matrix, original_df, title_col = "title", revenue_col = "revenue") {
  # Make predictions
  predicted_log_revenue <- predict(model, newdata = test_matrix)
  
  # Add predictions back to the original test data
  test <- original_df %>%
    filter(complete.cases(.)) %>%
    slice(-train_idx) %>%  # Use the same test rows
    mutate(
      predicted_log_revenue = predicted_log_revenue,
      actual_log_revenue = log10(.data[[revenue_col]]),
      residual = actual_log_revenue - predicted_log_revenue
    ) %>%
    arrange(desc(.data[[revenue_col]])) %>%
    slice_head(n = 100) %>%
    mutate(
      title = fct_reorder(.data[[title_col]], actual_log_revenue),
      label_actual = paste0("Title: ", .data[[title_col]], 
                            "\nActual log-revenue: ", round(actual_log_revenue, 2)),
      label_pred = paste0("Title: ", .data[[title_col]], 
                          "\nPredicted log-revenue: ", round(predicted_log_revenue, 2))
    )
  
  return(test)
}
top100_xgb <- evaluate_model_predictions_xgb(
  model = xgb_model,
  test_matrix = xgb_test,
  original_df = movies
)

plot_prediction_comparison(top100_xgb)
## Warning in geom_point(aes(y = actual_log_revenue, text = label_actual), :
## Ignoring unknown aesthetics: text
## Warning in geom_point(aes(y = predicted_log_revenue, text = label_pred), :
## Ignoring unknown aesthetics: text